* Reset the session before building the outcome variables: empty the data in
* memory, turn off output paging, clear global macros and close any open log.
clear
set more off
* The keyword for dropping every global macro is _all; without the underscore
* the command only targets a macro literally named "all".
macro drop _all
* capture keeps the script running when no log is currently open
capture log close

/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection
Create Post Variables 

Created on: 2/26/2022
Last Modified on: 2/13/2024

Description: This do file creates outcome variables.

Note that we have removed the file directory names from this program for 
confidentiality reasons. 
********************************************************************************/

** Setting the Directory
** Each global is intentionally blank (paths removed for confidentiality);
** fill in the path between the quotes before running.
global rawdata ""
global cleandata ""
global tmp ""
global jj ""

/********************************************************************************

For children who were placed into foster care, the AFTER variables should be 
defined after the removal date. For children not placed into foster care, the 
AFTER variables should be defined after the complaint date (thereby measuring
after the child could have been placed in foster care).

I will create the following POST variables:

(1) ACADEMIC:
	-Age at Kindergarten
	-Attendance rate
	-Chronically absent
	-Special Ed
	-Low Income
	-Test scores (G3-G8, ACT/SAT)
	-Test score proficiency (G3-G8, ACT/SAT)
	-Repeated a grade
	-High School Graduation
	-Highest Grade Completed
	-College enrollment
	
(2) BEHAVIORAL:
	-Expelled
	-Suspended
	-Crime

(3) SCHOOL ENVIRONMENT:
	-Urbanicity
	-Charter
	-School size
	-Racial Composition in School
	-% Frpl in School

(4) NEIGHBORHOOD ENVIRONMENT:
	-# Different Census Blocks Lived in
	-Median Income
	-Employment Rate
	-% with BA or Higher
	-Racial Composition of Neighborhood
	-Homeless

(5) FUTURE MALTREATMENT
	-Investigation of Abuse/Neglect
	-Substantiated Investigation of Abuse/Neglect
	
I will think of some of these measures as time invariant (e.g., HS graduation)
and others as time varying (e.g., attendance rate). For the time-varying measures,
I want to create both a variable for the AVERAGE measure across all POST observations
and a variable for the FIRST measure among all POST observations (e.g., track the average
attendance rate of the child across all POST obs and also track their attendance
rate in the year just after they were removed/investigated).
	
*******************************************************************************/

**Load student*year panel and set it aside so the foster care exit date
**(fc_enddt) can be attached from the DHHS master file
use "$cleandata/student_year_panel_withpre.dta", clear
sort vicid inv_caseid
tempfile old
save `old'

**One row per victim-investigation in the master file; only the exit date is needed
use "$cleandata/master_dhhs.dta", clear
keep vicid inv_caseid fc_enddt
sort vicid inv_caseid
**keep(match using) discards master-file rows that have no counterpart in the panel
merge 1:m vicid inv_caseid using `old', keep(match using) nogenerate


***********************
***(1) ACADEMIC POST MEASURES
***********************

*****AGE AT KINDERGARTEN
**Day in September on which school opened in calendar years 2002 through 2017
**(one entry per calendar year, in order)
local firstdays 3 2 7 6 5 4 2 8 7 6 4 3 2 8 5 5

**start = first day of the school year in variable year (which opens the previous
**fall); start1 = first day of the following school year
gen start=.
gen start1=.
forv sy=2003/2017 {
	local pos=`sy'-2002
	local nextpos=`pos'+1
	local day_open : word `pos' of `firstdays'
	local day_next : word `nextpos' of `firstdays'
	replace start=mdy(9,`day_open',`sy'-1) if year==`sy'
	replace start1=mdy(9,`day_next',`sy') if year==`sy'
}
la var start "First Day of School"
la var start1 "First Day of School Next Year"

**Age in years on the first day of a kindergarten (grade 0) year after the
**investigation year; ages outside the open interval (2,8) are discarded
gen post_age_gk_tmp=(start-birthdate)/365.25 if grade_fnl==0 & year>cps_year
replace post_age_gk_tmp=. if !(post_age_gk_tmp>2 & post_age_gk_tmp<8)
**Keep the youngest such age and spread it over the child-investigation rows
gegen post_age_gk=min(post_age_gk_tmp), by(ric inv_caseid)
drop post_age_gk_tmp
la var post_age_gk "Age in Kindergarten"

**SPECIAL ED AND LOW INCOME
**Foster care status (annual measure) in each of the 5 school years after the
**investigation school year, spread over every row of the child-investigation pair
forv i=1/5 {
	gen post`i'_infc_annual_tmp=infc_annual if year==cps_sy+`i'
	gegen infc_post`i'_annual=max(post`i'_infc_annual_tmp), by(ric inv_caseid)
	drop post`i'_infc_annual_tmp
	la var infc_post`i'_annual "In Foster Care `i' SY After Investigation (for annual measures)"
}

**Earliest year in which the student appears in the panel
gegen firstyear=min(year), by(ric)

**Disability-category indicators built from the sped_disability code.
**Codes 3, 4, 5, 6 and 14 all count as mentally/emotionally impaired. A single
**inlist() is used because replacing the whole variable once per code overwrites
**the earlier matches and leaves only the last code (14) flagged, which in turn
**pushes codes 3-6 into the residual "other" category below.
gen sped_emotimp=inlist(sped_disability,3,4,5,6,14)
la var sped_emotimp "SPED- Mentally or Emotionally Impaired"
gen sped_ld=sped_disability==13
la var sped_ld "SPED- Learning Disability"
gen sped_physimp=(sped_disability==7 | sped_disability==8 | sped_disability==9 | sped_disability==17)
la var sped_physimp "SPED- Physical Disability (Hearing, Visual or Physical)"
gen sped_other_hlth=(sped_disability==20)
la var sped_other_hlth "SPED- Other Health Impairment"
gen sped_speech=sped_disability==10
la var sped_speech "SPED- Speech and Language Impaired"
gen sped_autism=sped_disability==15
la var sped_autism "Sped- Autism"
**Residual category: in special ed but in none of the categories above
gen sped_other=(sped==1 & sped_emotimp==0 & sped_ld==0 & sped_physimp==0 & sped_other_hlth==0 & sped_speech==0 & sped_autism==0)
la var sped_other "SPED- Other"

**Value of each measure in school year cps_sy+i, spread over the child-investigation rows
foreach x in sped sped_emotimp sped_ld sped_physimp sped_other_hlth sped_speech sped_autism sped_other poor {
	forv i=1/5 {
		gen post`i'_`x'_tmp=`x' if year==cps_sy+`i'
		gegen post`i'_`x'=max(post`i'_`x'_tmp), by(ric inv_caseid)
		drop post`i'_`x'_tmp
	}
}

forv i=1/5 {
	la var post`i'_sped "SPED `i' SY After Investigation"
	la var post`i'_sped_emotimp "SPED- Emotionally Impaired `i' SY After Investigation"
	la var post`i'_sped_ld "SPED- Learning Disability `i' SY After Investigation"
	la var post`i'_sped_physimp "SPED- Physically Impaired `i' SY After Investigation"
	la var post`i'_sped_other_hlth "SPED- Other Health Impairment `i' SY After Investigation"
	la var post`i'_sped_speech "SPED- Speech/Language `i' SY After Investigation"
	la var post`i'_sped_autism "SPED- Autism `i' SY After Investigation"
	la var post`i'_sped_other "SPED- Other `i' SY After Investigation"
	la var post`i'_poor "Poor `i' SY After Investigation"
}

**ATTENDANCE RATE
**Attendance rate observed in school year cps_sy+k, copied to every row of the
**child-investigation pair and labelled in the same pass
forv k=1/5 {
	gen post`k'_attend_tmp=attend if year==cps_sy+`k'
	gegen post`k'_attend=max(post`k'_attend_tmp), by(ric inv_caseid)
	drop post`k'_attend_tmp
	la var post`k'_attend "Atd Rate `k' SY After Investigation"
}

*****TEST SCORES & TEST SCORE PROFICIENCY*****

**Last day of the testing window for grades 3-8: October 31 of the fall
**semester (calendar year-1) through SY 2014, May 31 from SY 2015 onward
gen testdt=mdy(10,31,year-1) if year<=2014 & inrange(grade_fnl,3,8)
replace testdt=mdy(5,31,year) if year>2014 & inrange(grade_fnl,3,8)

**Grade 3-8 exams: for each grade and subject, whether the test was taken, the
**standardized score and the proficiency flag, all restricted to rows in that
**grade in a school year after the investigation school year
forvalues grade=3/8 {
	foreach subj in math reading {
		local Subj=proper("`subj'")
		local inpost grade_fnl==`grade' & year>cps_sy

		**Took MEAP/MSTEP
		gen post_took_`subj'_g`grade'_tmp=(`subj'stdss!=.) if `inpost'
		gegen post_took_`subj'_g`grade'=max(post_took_`subj'_g`grade'_tmp), by(ric inv_caseid)
		drop post_took_`subj'_g`grade'_tmp
		la var post_took_`subj'_g`grade' "Took `Subj' MEAP/MSTEP in Grade `grade'"

		**MEAP/MSTEP score
		gen post_`subj'_g`grade'_tmp=`subj'stdss if post_took_`subj'_g`grade'==1 & `inpost'
		gegen post_`subj'_g`grade'=max(post_`subj'_g`grade'_tmp), by(ric inv_caseid)
		drop post_`subj'_g`grade'_tmp
		la var post_`subj'_g`grade' "Std Score on `Subj' MEAP/MSTEP in Grade `grade'"

		**MEAP/MSTEP proficiency
		gen post_`subj'prof_g`grade'_tmp=`subj'prof if post_took_`subj'_g`grade'==1 & `inpost'
		gegen post_`subj'prof_g`grade'=max(post_`subj'prof_g`grade'_tmp), by(ric inv_caseid)
		drop post_`subj'prof_g`grade'_tmp
		la var post_`subj'prof_g`grade' "Proficient on `Subj' MEAP/MSTEP in Grade `grade'"
	}
}

**define what year is post1, post2, etc, given the different testing schedules
gen removal_month=month(removal_date)
gen removal_year=year(removal_date)

**First day of each school year, kept as globals so the September removals
**below can be assigned to the correct school year
global start2003=mdy(9,3,2002)
global start2004=mdy(9,2,2003)
global start2005=mdy(9,7,2004)
global start2006=mdy(9,6,2005)
global start2007=mdy(9,5,2006)
global start2008=mdy(9,4,2007)
global start2009=mdy(9,2,2008)
global start2010=mdy(9,8,2009)
global start2011=mdy(9,7,2010)
global start2012=mdy(9,6,2011)
global start2013=mdy(9,4,2012)
global start2014=mdy(9,3,2013)
global start2015=mdy(9,2,2014)
global start2016=mdy(9,8,2015)
global start2017=mdy(9,5,2016)

**note: some of the removal dates are in 2017, and since we don't yet have 2018 student
**level data, the outcomes will be missing for these students as of Sep 2018.
**School year of removal: Jan-Aug belongs to the SY ending that calendar year,
**Oct-Dec to the next one, and September is split at the first day of school
gen removal_sy=.
replace removal_sy=removal_year if removal_month>=1 & removal_month<=8 & removal_date!=.
forv i=2008/2016 {
	local i1=`i'+1
	replace removal_sy=`i' if removal_month==9 & removal_date<${start`i1'} & removal_year==`i'
	replace removal_sy=`i1' if removal_month==9 & removal_date>=${start`i1'} & removal_year==`i'
}
replace removal_sy=removal_year+1 if removal_month>=10 & removal_date!=.

**Flag the panel row that holds the first test taken after the event. Children
**not placed (fc==0) are timed from the complaint (cps_*), children placed
**(fc==1) from the removal (removal_*).
**(a) Event in Jan-Aug
gen flag_post1=1 if year==cps_sy+1 & fc==0 & cps_year<=2014 & inrange(cps_month,1,8)
replace flag_post1=1 if year==cps_sy & fc==0 & cps_year>2014 & inrange(cps_month,1,5)
replace flag_post1=1 if year==cps_sy+1 & fc==0 & cps_year>2014 & inrange(cps_month,6,8)
**Fix: the removal-based rules apply to placed children (fc==1). With fc==0 they
**could never fire for a child who actually has a removal date.
replace flag_post1=1 if year==removal_sy+1 & fc==1 & removal_year<=2014 & inrange(removal_month,1,8)
replace flag_post1=1 if year==removal_sy & fc==1 & removal_year>2014 & inrange(removal_month,1,5)
replace flag_post1=1 if year==removal_sy+1 & fc==1 & removal_year>2014 & inrange(removal_month,6,8)

**(b) Event in Sep-Oct
replace flag_post1=1 if year==cps_sy & fc==0 & inrange(cps_month,9,10)
**Fix: placed children are timed by the removal month, not the complaint month
replace flag_post1=1 if year==removal_sy & fc==1 & inrange(removal_month,9,10)

**(c) Event in Nov-Dec
**NOTE(review): these four rules condition on the panel year rather than on
**cps_year/removal_year as in (a); left as found, confirm that is intended
replace flag_post1=1 if year==cps_sy+1 & fc==0 & year<=2014 & inrange(cps_month,11,12)
replace flag_post1=1 if year==cps_sy & fc==0 & year>2014 & inrange(cps_month,11,12)
replace flag_post1=1 if year==removal_sy+1 & fc==1 & year<=2014 & inrange(removal_month,11,12)
replace flag_post1=1 if year==removal_sy & fc==1 & year>2014 & inrange(removal_month,11,12)

**post1_year is the latest flagged year; post2-post5 follow consecutively
gen post1_year_tmp=year if flag_post1==1
gegen post1_year=max(post1_year_tmp), by(ric inv_caseid)
drop flag_post1 post1_year_tmp
forv i=2/5 {
	local i1=`i'-1
	gen post`i'_year=post`i1'_year+1
}

**Test scores, by How Many Years After the Investigation
foreach subj in math reading {
	forv i=1/5 {

		**Took MEAP/MSTEP
		gen post`i'_took_`subj'_tmp=1 if `subj'stdss!=. & inrange(grade_fnl,3,8) & year==post`i'_year
		replace post`i'_took_`subj'_tmp=0 if `subj'stdss==. & inrange(grade_fnl,3,8) & year==post`i'_year
		gegen post`i'_took_`subj'=max(post`i'_took_`subj'_tmp), by(ric inv_caseid)
		drop post`i'_took_`subj'_tmp

		**MEAP/MSTEP score
		gen post`i'_`subj'_tmp=`subj'stdss if post`i'_took_`subj'==1 & inrange(grade_fnl,3,8) & year==post`i'_year
		gegen post`i'_`subj'=max(post`i'_`subj'_tmp), by(ric inv_caseid)
		drop post`i'_`subj'_tmp

		**MEAP/MSTEP proficiency
		gen post`i'_`subj'prof_tmp=`subj'prof if post`i'_took_`subj'==1 & inrange(grade_fnl,3,8) & year==post`i'_year
		gegen post`i'_`subj'prof=max(post`i'_`subj'prof_tmp), by(ric inv_caseid)
		drop post`i'_`subj'prof_tmp
	}
}

forv i=1/5 {
	la var post`i'_took_math "Took Math MEAP/MSTEP `i' Year After Investigation"
	la var post`i'_took_reading "Took Reading MEAP/MSTEP `i' Year After Investigation"
	la var post`i'_math "Std Math Score on MEAP/MSTEP `i' Year After Investigation"
	la var post`i'_reading "Std Reading Score on MEAP/MSTEP `i' Year After Investigation"
	la var post`i'_mathprof "Math Proficiency on MEAP/MSTEP `i' Year After Investigation"
	la var post`i'_readingprof "Reading Proficiency on MEAP/MSTEP `i' Year After Investigation"
}
**The post-year helpers are only needed to build the measures above
drop post1_year-post5_year

**REPEATED A GRADE
**A row counts as a repeat when the previous row (same child, same investigation,
**sorted by year) carries the same grade
sort ric inv_caseid year
local samekid ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1]
gen post_rep_grade_tmp=(grade_fnl==grade_fnl[_n-1] & `samekid' & year>cps_sy)
gegen post_rep_grade=max(post_rep_grade_tmp), by(ric inv_caseid)
drop post_rep_grade_tmp
la var post_rep_grade "Ever repeated grade after investigation"

**Same comparison for each of the 5 school years after the investigation SY;
**rows without a preceding row for the same child-investigation stay missing
sort ric inv_caseid year
forv k=1/5 {
	gen post`k'_rep_grade_tmp=(grade_fnl==grade_fnl[_n-1]) if `samekid' & year==cps_sy+`k'
	gegen post`k'_rep_grade=max(post`k'_rep_grade_tmp), by(ric inv_caseid)
	drop post`k'_rep_grade_tmp
	la var post`k'_rep_grade "Repeated Grade `k' SY After Investigation"
}

**ACT/SAT exams (takes place in March)
**A grade 10-12 row counts as post-investigation when it is in a later school year,
**or in the investigation school year with the investigation in Sep-Feb
gen post_took_actsat_tmp=(actcompositestd!=. | satcompositestd!=.) if inrange(grade_fnl,10,12) & year>cps_sy
replace post_took_actsat_tmp=(actcompositestd!=. | satcompositestd!=.) if inrange(grade_fnl,10,12) & year==cps_sy & (inrange(cps_month,9,12) | inrange(cps_month,1,2))
gegen post_took_actsat=max(post_took_actsat_tmp), by(ric inv_caseid)
drop post_took_actsat_tmp
la var post_took_actsat "Took ACT/SAT in Post Year"

**Highest standardized ACT and SAT score over the post rows.
**NOTE(review): the SAT window admits cps_month 1-3 while the ACT window and the
**took-test flag above use 1-2; left as found, confirm which is intended
gegen post_actstd_tmp=max(actcompositestd) if year>cps_sy | (year==cps_sy & (inrange(cps_month,9,12) | inrange(cps_month,1,2))), by(ric inv_caseid)
gegen post_satstd_tmp=max(satcompositestd) if year>cps_sy | (year==cps_sy & (inrange(cps_month,9,12) | inrange(cps_month,1,3))), by(ric inv_caseid)
**Better of the two scores. max() skips missing arguments, so a student with only
**one of the two scores keeps it. The former act<sat comparison is true when the
**SAT score is missing (missing sorts above every number) and therefore replaced
**ACT-only scores with missing.
gen post_actsatstd_tmp=max(post_actstd_tmp,post_satstd_tmp)
gegen post_actsatstd=max(post_actsatstd_tmp), by(ric inv_caseid)
la var post_actsatstd "Standardized ACT/SAT Score in Post Years"
drop post_actsatstd_tmp post_actstd* post_satstd*

**High School Graduation
***note: could graduate high school if they would have been in 12th grade by 2017
**Projection: grade in the student's last observed year plus the years remaining to 2017
gen post_couldgradhs_tmp=.
gegen year_max=max(year), by(ric)
gen grade_year_max=grade_fnl if year==year_max
replace post_couldgradhs_tmp=1 if 2017-year_max+grade_year_max>=12 & inrange(grade_year_max,0,12)
replace post_couldgradhs_tmp=0 if 2017-year_max+grade_year_max<12 & inrange(grade_year_max,0,12)
***note: correct for 11th graders who graduated
replace post_couldgradhs_tmp=1 if gradhs==1
gegen post_couldgradhs=max(post_couldgradhs_tmp), by(ric inv_caseid)
**correct for grade=14 or grade=30 students (test written with missing() rather
**than a single = sign)
replace post_couldgradhs=0 if missing(post_couldgradhs)
**year_max and grade_year_max are NOT dropped here: the college-eligibility rule
**further down still needs them and drops them once it is done
drop post_couldgradhs_tmp
la var post_couldgradhs "Could Have Graduated High School by 2017"

**note: use year>=cps_year here because no removals/investigations should happen AFTER
**a child graduated from high school. Therefore, we can safely include a child's
**graduation year.
gen post_gradhs_tmp=1 if post_couldgradhs==1 & gradhs==1
replace post_gradhs_tmp=0 if post_couldgradhs==1 & gradhs==0
gegen post_gradhs=max(post_gradhs_tmp), by(ric inv_caseid)
drop post_gradhs_tmp
la var post_gradhs "Graduated High School After Investigation"

**On-time graduation: graduated no later than the year in which the child would
**reach 12th grade when advancing one grade per year from the investigation grade
gen gradhs_year=year if gradhs==1
gen post_gradhs_ontime_tmp=1 if post_couldgradhs==1 & gradhs_year<=(cps_sy+12-cps_grade) & inrange(cps_grade,0,12)
replace post_gradhs_ontime_tmp=0 if post_couldgradhs==1 & gradhs==0
replace post_gradhs_ontime_tmp=0 if post_couldgradhs==1 & gradhs==1 & gradhs_year>(cps_sy+12-cps_grade) & inrange(cps_grade,0,12)
gegen post_gradhs_ontime=max(post_gradhs_ontime_tmp), by(ric inv_caseid)
drop post_gradhs_ontime_tmp
la var post_gradhs_ontime "Graduated HS in expected 12th grade year"

**Highest Grade Enrolled (sort of like a measure of dropout)
**Only grades 0-12 count, and only rows after the investigation year
gen grade_new=grade_fnl if inrange(grade_fnl,0,12)
gegen highest_grade_completed_tmp=max(grade_new) if year>cps_year, by(ric inv_caseid)
gegen post_highest_grade_completed=max(highest_grade_completed_tmp), by(ric inv_caseid)
drop grade_new highest_grade_completed_tmp
la var post_highest_grade_completed "Highest Grade Completed After Investigation"

**College Enrollment
***note: could enroll in college if they would have been in 12th grade by 2016
gen post_couldenrollcoll_tmp=.
replace post_couldenrollcoll_tmp=1 if 2016-year_max+grade_year_max>=12 & inrange(grade_year_max,0,12)
replace post_couldenrollcoll_tmp=0 if 2016-year_max+grade_year_max<12 & inrange(grade_year_max,0,12)
***note: correct for 11th graders in 2016 who graduated
replace post_couldenrollcoll_tmp=1 if 2016-cps_sy+cps_grade==11 & gradhs==1
gegen post_couldenrollcoll=max(post_couldenrollcoll_tmp), by(ric inv_caseid)
drop post_couldenrollcoll_tmp year_max grade_year_max
la var post_couldenrollcoll "Could Have Enrolled in College by 2017"

**note: don't need any year restriction here because all investigations should occur before
**a child is in college
gen post_enrollcoll_tmp=1 if post_couldenrollcoll==1 & enrolled_college==1
replace post_enrollcoll_tmp=0 if post_couldenrollcoll==1 & enrolled_college!=1
gegen post_enrollcoll=max(post_enrollcoll_tmp), by(ric inv_caseid)
drop post_enrollcoll_tmp
la var post_enrollcoll "Enrolled in College After Investigation"

**Same measure separately for 2-year and 4-year colleges
foreach x in 2 4 {
	gen post_enrollcoll`x'yr_tmp=1 if post_couldenrollcoll==1 & enrolled_college_`x'yr==1
	replace post_enrollcoll`x'yr_tmp=0 if post_couldenrollcoll==1 & enrolled_college_`x'yr!=1
	gegen post_enrollcoll`x'yr=max(post_enrollcoll`x'yr_tmp), by(ric inv_caseid)
	drop post_enrollcoll`x'yr_tmp
	la var post_enrollcoll`x'yr "Enrolled in `x' Yr College After Investigation"
}

***********************
***(2) BEHAVIORAL OUTCOMES- DISCIPLINE/CRIME
***********************

**Expulsions (note: I do not use suspensions because of data quality issues)
**Expulsion flag observed in school year cps_sy+k, copied to every row of the
**child-investigation pair and labelled in the same pass
forv k=1/5 {
	gen post`k'_expelled_tmp=expelled if year==cps_sy+`k'
	gegen post`k'_expelled=max(post`k'_expelled_tmp), by(ric inv_caseid)
	drop post`k'_expelled_tmp
	la var post`k'_expelled "Expelled `k' SY After Investigation"
}

**Juvenile justice court hearing
**Step 1: crosswalk from JJ match groups to rics, one row per match group
preserve
import delimited using $jj/20190313-jj-valid-rics.csv, varnames(1) clear
gduplicates drop match_group, force	//there are 2 match groups corresponding to more than 1 ric//
tempfile jj_link
save `jj_link'
restore

**Step 2: distinct filing dates per match group, restricted to match groups that
**appear in the crosswalk, then keyed by ric instead of match group
preserve
gzuse $jj/jj_filing_dates.dta.gz, clear
keep match_group case_date
gduplicates drop
merge m:1 match_group using `jj_link', keep(match) nogenerate
drop match_group
tempfile jj_dates
save `jj_dates'
restore

**Pair every child-investigation with every JJ filing date of that child;
**children without any filing drop out here (unmatched(none))
preserve
keep ric inv_caseid fc cps_date removal_date cps_age cps_sy
gduplicates drop
joinby ric using `jj_dates', unmatched(none)

**Create variable for whether someone had a JJ petition filed after the investigation:
**ever, by year & by month
gen post_jj_tmp=(case_date!=. & case_date>cps_date)
gegen post_jj=max(post_jj_tmp), by(ric inv_caseid)
drop post_jj_tmp
la var post_jj "Ever had a JJ petition filed after the investigation"

**Petition filed within the k-th 365-day window after the complaint (fc==0)
**or after the removal (fc==1)
forv k=1/5 {
	local lo=365*(`k'-1)
	local hi=365*`k'
	gen post`k'_jj_tmp=(fc==0 & inrange(case_date,cps_date+`lo',cps_date+`hi')) | (fc==1 & inrange(case_date,removal_date+`lo',removal_date+`hi'))
	**window not usable past SY 2015, or once the child is older than 16
	replace post`k'_jj_tmp=. if cps_sy+`k'>2015 | cps_age+`k'>16 //age of majority is 17 in Michigan//
	gegen post`k'_jj=max(post`k'_jj_tmp), by(ric inv_caseid)
	drop post`k'_jj_tmp
	la var post`k'_jj "Had a JJ Petition `k' Years After Investigation"
}

**Collapse to one row per child-investigation holding only the new outcomes
keep ric inv_caseid post*
gduplicates drop
tempfile jj_post
save `jj_post'
restore

**There are 8 counties missing JJ coverage. Keep investigations in these counties
**missing for the JJ outcome: Ingham, Kent, Keweenaw, Berrien, Ottawa, Delta,
**Kalamazoo, Washtenaw.
**Child-investigations that did not match a JJ record get 0 for every JJ outcome
**first; rows whose worker_county is one of the 8 counties are then set to missing.
merge m:1 ric inv_caseid using `jj_post'
foreach x in post_jj post1_jj post2_jj post3_jj post4_jj post5_jj {
	replace `x'=0 if _merge!=3
	foreach c in Ingham Kent Keweenaw Berrien Ottawa Delta Kalamazoo Washtenaw {
		replace `x'=. if worker_county=="`c' County"
	}
}
drop _merge


***********************
***(3) SCHOOL ENVIRONMENT
***********************

**Bring in enrollment dates at different schools
**Build a one-row-per-child-investigation crosswalk (dates, grade, foster care
**status) and attach it to every row of the child's school list.
**NOTE(review): cps_date is already referenced in the JJ section above, before
**this rename of complaint_date; confirm both variables exist in the panel
preserve
rename complaint_date cps_date
keep ric inv_caseid cps_date cps_sy cps_grade fc removal_date removal_sy fc_enddt 
gegen tag=tag(ric inv_caseid)
keep if tag==1
drop tag
tempfile cw
save `cw'

use "$cleandata/school_list_full.dta", clear
joinby ric using `cw', unmatched(none)

*****# DIFFERENT SCHOOL TRANSITIONS*****
**first identify the school that child attended during the investigation (use first obs after
**CPS SY if they were not in school)
**flag marks enrollments that began before the complaint (fc==0) or the removal (fc==1)
gen flag=0
replace flag=1 if cps_date>enrolldate & fc==0
replace flag=1 if removal_date>enrolldate & fc==1
**the latest flagged enrollment is the school at the time of the event
gegen first=max(enrolldate) if flag==1, by(ric inv_caseid)
gen cps_school=first==enrolldate
gegen flag_max=max(flag), by(ric inv_caseid)
**no flagged enrollment at all: fall back to the first row of the group
bysort ric inv_caseid: replace cps_school=1 if _n==1 & flag_max==0

**number of later schools recorded on the event-time school row
gen post_n_schools_tmp=n_later_schools if cps_school==1
gegen post_n_schools=max(post_n_schools_tmp), by(ric inv_caseid)
drop post_n_schools_tmp flag flag_max first 
la var post_n_schools "# School Transitions after Investigation"

**clean up dual enrollment scenarios by getting rid of schools that a student enrolled
**in at the same time as another school if they were enrolled in the other school
**in the previous year. Analogous for the next year also.
**Pass 1: among rows sharing child, investigation, year and enrolldate, prefer the
**school (bcode) the child was also in the year before
gduplicates tag ric inv_caseid year enrolldate, gen(dups)
gen enrolled_lastyear=0
bysort ric inv_caseid: replace enrolled_lastyear=1 if _n==1
sort ric inv_caseid bcode year
replace enrolled_lastyear=1 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & year==year[_n-1]+1 & bcode==bcode[_n-1]
gegen enrolled_lastyear_sum=sum(enrolled_lastyear), by(ric inv_caseid year enrolldate)
**drop a row with dups==1 only when a competing row was attended last year
drop if dups==1 & enrolled_lastyear==0 & enrolled_lastyear_sum>0
drop dups enrolled_lastyear*

**Pass 2: same rule using the school attended in the following year
gduplicates tag ric inv_caseid year enrolldate, gen(dups)
sort ric inv_caseid year bcode
gen enrolled_nextyear=0
bysort ric inv_caseid: replace enrolled_nextyear=1 if _n==_N
sort ric inv_caseid bcode year
replace enrolled_nextyear=1 if ric==ric[_n+1] & inv_caseid==inv_caseid[_n+1] & year==year[_n+1]-1 & bcode==bcode[_n+1]
gegen enrolled_nextyear_sum=sum(enrolled_nextyear), by(ric inv_caseid year enrolldate)
drop if dups==1 & enrolled_nextyear==0 & enrolled_nextyear_sum>0
drop dups enrolled_next*

**Pass 3: any remaining ties are broken at random (fixed seed); the row with the
**largest draw in each tied group is kept
set seed 1234
gen random=runiform()
gduplicates tag ric inv_caseid year enrolldate, gen(dups)
gegen random_max=max(random) if dups>0, by(ric inv_caseid year enrolldate)
drop if random!=random_max & dups>0
drop dups random*

**Change the enrolldate variable to be the first date that a child enrolled in that 
**school if it was just a consecutive enrollment
**(the replace runs row by row in sorted order, so the earlier date carries
**forward through a run of consecutive years at the same bcode)
sort ric inv_caseid year enrolldate
replace enrolldate=enrolldate[_n-1] if ric==ric[_n-1] & year==year[_n-1]+1 & bcode==bcode[_n-1] 

**enrollment began on/after the removal and before the foster care end date
**(or the end date is missing)
gen infc_diff_school=0
replace infc_diff_school=1 if fc==1 & removal_date<=enrolldate & (fc_enddt>enrolldate | fc_enddt==.)

**Create a binary indicator at the student*year level for whether a student had a transition
**that year
**(1 when the enrolldate differs from the previous row of the same
**child-investigation, 0 when it is the same, missing on the group's first row)
sort ric inv_caseid year enrolldate
gen diff_school_tmp=1 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & enrolldate!=enrolldate[_n-1]
replace diff_school_tmp=0 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & enrolldate==enrolldate[_n-1]
gegen diff_school=max(diff_school_tmp), by(ric inv_caseid year)
**write the student*year mobility file; the nested preserve/restore leaves the
**school list in memory untouched
preserve
keep ric inv_caseid year diff_school
gduplicates drop
save "$cleandata/student_year_mobility.dta", replace
restore

*****ATTENDED DIFFERENT SCHOOL*****
**Had a school transition in post 1 means there should be an enrolldate within 1 year
**after investigation. Had a transition in post 2 means there should be an enrolldate
**between 1 year later and 2 years later. I am careful to not include the first school
**that a student attends as a school transition.
gegen enrolldate_min=min(enrolldate), by(ric)

**The rules below that blank out zeros past 12th grade name the _tmp variable in
**full. The shorter name they used before is only created by the gegen that
**follows, so it resolved solely through variable abbreviation and would stop
**with an error under set varabbrev off.

****(a) Annual
forv i=1/5 {
	gen post`i'_diff_school_tmp=0
	local i1=`i'-1
	**enrollment starts inside the i-th 365-day window after the complaint/removal
	replace post`i'_diff_school_tmp=1 if fc==0 & inrange(enrolldate,cps_date+365*`i1',cps_date+365*`i')
	replace post`i'_diff_school_tmp=1 if fc==1 & inrange(enrolldate,removal_date+365*`i1',removal_date+365*`i')
	**the child's very first enrollment is not a transition
	replace post`i'_diff_school_tmp=. if enrolldate==enrolldate_min
	**window extends beyond the sample period
	replace post`i'_diff_school_tmp=. if cps_sy+`i'>2017
	**a zero is uninformative once the child would be past 12th grade
	replace post`i'_diff_school_tmp=. if post`i'_diff_school_tmp==0 & inrange(cps_grade,0,14) & cps_grade+`i'>12
	gegen post`i'_diff_school=max(post`i'_diff_school_tmp), by(ric inv_caseid)

	drop post`i'_diff_school_tmp

	la var post`i'_diff_school "Switched Schools Between `i1' and `i' Years after Investigation"
}

****(b) Monthly
forv i=1/60 {
	gen postm`i'_diff_school_tmp=0
	local i1=`i'-1
	**enrollment starts inside the i-th 30-day window after the complaint/removal
	replace postm`i'_diff_school_tmp=1 if fc==0 & inrange(enrolldate,cps_date+30*`i1',cps_date+30*`i')
	replace postm`i'_diff_school_tmp=1 if fc==1 & inrange(enrolldate,removal_date+30*`i1',removal_date+30*`i')
	replace postm`i'_diff_school_tmp=. if enrolldate==enrolldate_min
	**Set to missing if post month is beyond the sample period
	replace postm`i'_diff_school_tmp=. if fc==0 & cps_date+30*`i'>date("6/1/2017","MDY")
	replace postm`i'_diff_school_tmp=. if fc==1 & removal_date+30*`i'>date("6/1/2017","MDY")
	replace postm`i'_diff_school_tmp=. if postm`i'_diff_school_tmp==0 & inrange(cps_grade,0,14) & cps_grade+(`i'/12)>12	//this is a really rough workaround//
	gegen postm`i'_diff_school=max(postm`i'_diff_school_tmp), by(ric inv_caseid)
	drop postm`i'_diff_school_tmp
	la var postm`i'_diff_school "Switched Schools Between `i1' and `i' Months after Investigation"
}

**Reduce the school list to one row per child-investigation carrying only the
**post* measures, store it, and return to the student*year panel
gegen tag=tag(ric inv_caseid)
keep if tag==1
keep ric inv_caseid post*
tempfile nsch
save `nsch'
restore

merge m:1 ric inv_caseid using `nsch', nogenerate

****(c) Cumulative monthly measure of school mobility
**Running total of the monthly switch indicators: month m adds its own
**indicator to the total through month m-1
gen postm1_diff_school_cumu=postm1_diff_school
la var postm1_diff_school_cumu "# School Transitions After 1 Month"
forv m=2/60 {
	local prev=`m'-1
	gen postm`m'_diff_school_cumu=postm`prev'_diff_school_cumu+postm`m'_diff_school
	la var postm`m'_diff_school_cumu "# School Transitions After `m' Months"
}
	
*****SCHOOL CHARACTERISTICS IN YEARS AFTER INVESTIGATION*****
**Town/rural indicator from the urbanicity code; all three locale indicators
**are set to missing when urbanicity itself is missing
gen sch_town_rural=urbanicity==3
foreach x in urban suburb town_rural {
	replace sch_`x'=. if urbanicity==.
}

**Give the school-level inputs a common sch_ prefix
rename puptch sch_puptch
rename pupaide sch_pupaide
rename median_experience sch_median_exp
rename mn_math sch_math
rename mn_reading sch_reading

**Value of each school characteristic in school year cps_sy+i, spread over the
**child-investigation rows
foreach x in charter sch_size sch_white sch_black sch_hisp sch_frpl ///
	sch_urban sch_suburb sch_town_rural sch_puptch sch_median_exp sch_math sch_reading {

	forv i=1/5 {
		gen post`i'_`x'_tmp=`x' if year==cps_sy+`i'
		gegen post`i'_`x'=max(post`i'_`x'_tmp), by(ric inv_caseid)
		drop post`i'_`x'_tmp
	}
}

**Labels. Two label texts are corrected here: the suburb indicator used to carry
**the urban label, and the % White label lacked the space before SY.
forv i=1/5 {
	la var post`i'_sch_urban "Urban School `i' SY After Investigation"
	la var post`i'_sch_suburb "Suburban School `i' SY After Investigation"
	la var post`i'_sch_town_rural "Town/Rural School `i' SY After Investigation"
	la var post`i'_sch_puptch "Pupil Teacher Ratio in School `i' SY After Investigation"
	la var post`i'_sch_median_exp "Median Teacher Exp in School `i' SY After Investigation"
	la var post`i'_sch_math "Avg Math Score in School `i' SY After Investigation"
	la var post`i'_sch_reading "Avg Reading Score in School `i' SY After Investigation"
	la var post`i'_charter "Charter School `i' SY After Investigation"
	la var post`i'_sch_size "# Students in School `i' SY After Investigation"
	la var post`i'_sch_white "% White in School `i' SY After Investigation"
	la var post`i'_sch_black "% Black in School `i' SY After Investigation"
	la var post`i'_sch_hisp "% Hispanic in School `i' SY After Investigation"
	la var post`i'_sch_frpl "% FRPL in School `i' SY After Investigation"

}

***********************
***(4) NEIGHBORHOOD ENVIRONMENT
***********************

**Bring in enrollment dates at different neighborhoods
**Build a one-row-per-child-investigation crosswalk (dates, grade, foster care
**status) and attach it to every row of the child's census block list.
**NOTE(review): cps_date is already referenced in the JJ section above, before
**this rename of complaint_date; confirm both variables exist in the panel
preserve
rename complaint_date cps_date
keep ric inv_caseid cps_date cps_grade cps_sy fc removal_date removal_sy fc_enddt 
gegen tag=tag(ric inv_caseid)
keep if tag==1
drop tag
tempfile cw
save `cw'

use "$cleandata/censusblock_list_full.dta", clear
joinby ric using `cw', unmatched(none)

*****# DIFFERENT NBHD TRANSITIONS*****
**first identify the neighborhood that child lived in during the investigation (use first obs after
**CPS SY if they were not in school)
**flag marks records that began before the complaint (fc==0) or the removal (fc==1)
gen flag=0
replace flag=1 if cps_date>enrolldate & fc==0
replace flag=1 if removal_date>enrolldate & fc==1
**the latest flagged record is the neighborhood at the time of the event
gegen first=max(enrolldate) if flag==1, by(ric inv_caseid)
gen cps_nbhd=first==enrolldate
gegen flag_max=max(flag), by(ric inv_caseid)
**no flagged record at all: fall back to the first row of the group
bysort ric inv_caseid: replace cps_nbhd=1 if _n==1 & flag_max==0

**number of later census blocks recorded on the event-time neighborhood row
gen post_n_cb_tmp=n_later_cb if cps_nbhd==1
gegen post_n_cb=max(post_n_cb_tmp), by(ric inv_caseid)
drop post_n_cb_tmp flag flag_max first 
la var post_n_cb "# Nbhd Transitions after Investigation"

**clean up dual enrollment scenarios (eg. student was enrolled in 2 schools at the same
**time and they had conflicting address information)
**Pass 1: among rows sharing child, investigation, year and enrolldate, prefer the
**census block the child also lived in the year before
gduplicates tag ric inv_caseid year enrolldate, gen(dups)

gen resided_lastyear=0
bysort ric inv_caseid: replace resided_lastyear=1 if _n==1
sort ric inv_caseid censusblock year
replace resided_lastyear=1 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & year==year[_n-1]+1 & censusblock==censusblock[_n-1]
gegen resided_lastyear_sum=sum(resided_lastyear), by(ric inv_caseid year enrolldate)
**drop a row with dups==1 only when a competing row was the prior-year block
drop if dups==1 & resided_lastyear==0 & resided_lastyear_sum>0
drop dups resided_lastyear*

**Pass 2: same rule using the census block of the following year
gduplicates tag ric inv_caseid year enrolldate, gen(dups)
sort ric inv_caseid year censusblock
gen resided_nextyear=0
bysort ric inv_caseid: replace resided_nextyear=1 if _n==_N
sort ric inv_caseid censusblock year
replace resided_nextyear=1 if ric==ric[_n+1] & inv_caseid==inv_caseid[_n+1] & year==year[_n+1]-1 & censusblock==censusblock[_n+1]
gegen resided_nextyear_sum=sum(resided_nextyear), by(ric inv_caseid year enrolldate)
drop if dups==1 & resided_nextyear==0 & resided_nextyear_sum>0
drop dups resided_next*

**Pass 3: any remaining ties are broken at random (fixed seed); the row with the
**largest draw in each tied group is kept
set seed 1234
gen random=runiform()
gduplicates tag ric inv_caseid year enrolldate, gen(dups)
gegen random_max=max(random) if dups>0, by(ric inv_caseid year enrolldate)
drop if random!=random_max & dups>0
drop dups random*

**Change the enrolldate variable to be the first date that a child enrolled in that 
**school if it was just a consecutive enrollment
**(here the comparison is on censusblock: the earlier date carries forward
**through a run of consecutive years in the same census block)
sort ric inv_caseid year enrolldate
replace enrolldate=enrolldate[_n-1] if ric==ric[_n-1] & year==year[_n-1]+1 & censusblock==censusblock[_n-1] 

**record began on/after the removal and before the foster care end date
**(or the end date is missing); notfc is its complement
gen infc=0
replace infc=1 if fc==1 & removal_date<=enrolldate & (fc_enddt>enrolldate | fc_enddt==.)
gen notfc=infc==0

**Create a binary indicator at the student*year level for whether a student had a transition
**that year
**(1 when the census block differs from the previous row of the same
**child-investigation, 0 when it is the same, missing on the group's first row)
sort ric inv_caseid year enrolldate
drop if enrolldate==.
gen diff_cb_tmp=1 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & censusblock!=censusblock[_n-1]
replace diff_cb_tmp=0 if ric==ric[_n-1] & inv_caseid==inv_caseid[_n-1] & censusblock==censusblock[_n-1]
gegen diff_cb=max(diff_cb_tmp), by(ric inv_caseid year)
**add diff_cb to the student*year mobility file written in the school section;
**the nested preserve/restore leaves the census block list in memory untouched
preserve
keep ric inv_caseid year diff_cb
gduplicates drop
tempfile cb
save `cb'
use "$cleandata/student_year_mobility.dta", clear
merge 1:1 ric year inv_caseid using `cb'
drop _merge
save "$cleandata/student_year_mobility.dta", replace
restore

*****LIVED IN DIFFERENT NEIGHBORHOOD*****
**postN_diff_cb (N=1..5) flags a neighborhood transition in the N-th year after the
**reference date: it is 1 when the child has an enrolldate between 365*(N-1) and
**365*N days after cps_date (fc==0) or after removal_date (fc==1).
**E.g. a transition in post 1 means an enrolldate within 1 year after the
**investigation; a transition in post 2 means an enrolldate between 1 and 2 years
**later. inrange() includes both endpoints, so a date exactly on a boundary counts
**in both adjacent windows.
**I am careful to not include the first nbhd that a student lives in as a transition.
gegen enrolldate_min=min(enrolldate), by(ric)
forv i=1/5 {
	gen post`i'_diff_cb_tmp=0
	local i1=`i'-1
	replace post`i'_diff_cb_tmp=1 if fc==0 & inrange(enrolldate,cps_date+365*`i1',cps_date+365*`i')
	replace post`i'_diff_cb_tmp=1 if fc==1 & inrange(enrolldate,removal_date+365*`i1',removal_date+365*`i')
	**The child's earliest enrolldate is the first nbhd, not a move
	replace post`i'_diff_cb_tmp=. if enrolldate==enrolldate_min
	**Undefined when the horizon runs past school year 2017
	replace post`i'_diff_cb_tmp=. if cps_sy+`i'>2017
	**Undefined (rather than 0) when no move is flagged and the child would be past
	**grade 12 by then. The row-level _tmp flag is named in full: the child-level
	**variable is only created by the gegen below, so the short name would depend on
	**variable abbreviation being switched on.
	replace post`i'_diff_cb_tmp=. if post`i'_diff_cb_tmp==0 & inrange(cps_grade,0,14) & cps_grade+`i'>12

	**Collapse the row-level flag to the child/investigation level (max ignores missing)
	gegen post`i'_diff_cb=max(post`i'_diff_cb_tmp), by(ric inv_caseid)
	drop post`i'_diff_cb_tmp 
	la var post`i'_diff_cb "Switched Nbhd's Between `i1' and `i' Years after Investigation"
}

**Reduce to one row per child/investigation carrying only the post* variables,
**save it, return to the dataset held by the earlier preserve, and attach the
**post* variables to every row of the matching child/investigation.
gegen tag=tag(ric inv_caseid)
keep if tag==1
keep ric inv_caseid post*
tempfile cb
save `cb'
restore

merge m:1 ric inv_caseid using `cb'
drop _merge

**Rescale neighborhood median income by 1,000
replace nbhd_medinc=nbhd_medinc/1000

*****MEDIAN INCOME, EMPLOYMENT RATE, % BA OR HIGHER, RACIAL COMPOSITION IN NBHD, HOMELESS*****
**For each measure, postK_<measure> is the value observed in school year cps_sy+K,
**spread to every row of the child/investigation (the max is taken if several rows
**fall in that year). All measures except median income and homeless are then
**divided by 100.
local nbhd_measures nbhd_medinc nbhd_emp nbhd_bapl nbhd_white nbhd_black nbhd_hisp homeless
foreach m of local nbhd_measures {
	forvalues k=1/5 {
		gen post`k'_`m'_tmp=`m' if year==cps_sy+`k'
		gegen post`k'_`m'=max(post`k'_`m'_tmp), by(ric inv_caseid)
		drop post`k'_`m'_tmp

		if !inlist("`m'","nbhd_medinc","homeless") {
			replace post`k'_`m'=post`k'_`m'/100
		}
	}
}

**Variable labels for the neighborhood outcomes, one set per school year after the
**investigation
forvalues k=1/5 {
	local sfx "`k' SY After Investigation"
	label variable post`k'_nbhd_medinc "HH Median Income in CBG `sfx'"
	label variable post`k'_nbhd_emp "Employment Rate in CBG `sfx'"
	label variable post`k'_nbhd_bapl "% BA or Higher in CBG `sfx'"
	label variable post`k'_nbhd_white "% White in CBG `sfx'"
	label variable post`k'_nbhd_black "% Black in CBG `sfx'"
	label variable post`k'_nbhd_hisp "% Hispanic in CBG `sfx'"
	label variable post`k'_homeless "Homeless `sfx'"
}

**Label Variables
**Test scores
label variable mathpl_post2011 "Proficiency Level on Math Test (post 2011 standards)"
label variable readingpl_post2011 "Proficiency Level on Reading Test (post 2011 standards)"
label variable mathprof "Proficient on Math Test (post 2011 standards)"
label variable readingprof "Proficient on Reading Test (post 2011 standards)"
**Attendance and student characteristics
label variable attend "Attendance Rate"
label variable chronic_abs "Attendance Rate Less Than 90%"
label variable sped "Special Education"
label variable poor "Free/Reduced Price Lunch or Poverty Flag Indicator"
label variable derived_exitstatus "SRSD Exit Status Derived from Data"
**Neighborhood
label variable censusblockgroup "Census Block Group"
label variable nbhd_bapl "% BA or Higher in Census Block Group"

*****************************
**CREATE OUTCOME MEASURES THAT CHILDREN WHO DO NOT MATCH TO A RIC ARE STILL
**ELIGIBLE FOR
*****************************

**Set aside the outcome data built so far
tempfile postvars
save `postvars'

**Bring in Information From Master DHHS Data
**Hold the analysis sample in a tempfile so it can be matched to the DHHS records
use "${cleandata}analysis_sample.dta", clear
tempfile sample
save `sample'

**Keep only DHHS child/investigation records that are in the analysis sample
use "${cleandata}master_dhhs.dta", clear
merge 1:1 vicid inv_caseid using `sample', keep(match) nogenerate
sort vicid inv_caseid
**Pair each child/investigation with all of its outcome rows; unmatched rows from
**either side are retained
joinby vicid inv_caseid using `postvars', unmatched(both)
drop _merge

**Matched to the education data
**hasric is 1 when the row carries a ric (education-data identifier), 0 otherwise
gen hasric=(ric!=.)
la var hasric "Matched to MCER Data"

**AGE DURING INVESTIGATION
**age_inv is produced by personage from dob and complaint_date, then bounded.
**The bounds test age_inv itself: the bare name "age" would resolve to age_inv only
**through variable abbreviation, or to an unrelated variable named age if one exists.
drop cps_age
personage dob complaint_date, gen(age_inv)
replace age_inv=0 if age_inv<0 & age_inv!=.
**NOTE(review): values above 18 are set to 17 while exactly 18 is left unchanged --
**confirm whether the threshold should be >17.
replace age_inv=17 if age_inv>18 & age_inv!=.
la var age_inv "Age during Investigation"

*****ENROLLED IN A MICHIGAN PUBLIC SCHOOL IN YEARS AFTER Investigation
**note: children without rics have no cps_sy, so it is derived here from the
**complaint date, assuming the first day of school was about Sept 5.
drop cps_year cps_date cps_month
rename complaint_year cps_year
gen cps_month=month(complaint_date)
**Complaints before Sept 5 are assigned school year = calendar year of the complaint;
**complaints on or after Sept 5 are assigned the following year. Only rows with a
**missing cps_sy are filled in.
replace cps_sy=cps_year+(cps_month>=10 | (cps_month==9 & day(complaint_date)>=5)) if cps_sy==.

**post_enrolled_ever: 1 if the child/investigation has any row in a school year
**after cps_sy
gen post_enrolled_ever_tmp=(year>cps_sy & year!=.)
gegen post_enrolled_ever=max(post_enrolled_ever_tmp), by(vicid inv_caseid)
drop post_enrolled_ever_tmp
label variable post_enrolled_ever "Ever enrolled in a Michigan public school after inv"

**postK_enrolled: 1 if the child/investigation has a row in school year cps_sy+K.
**Set to missing when cps_sy+K is beyond 2017, or when the child is not observed,
**is matched to a ric, and would be past grade 12 by then.
forvalues k=1/5 {
	gen post`k'_enrolled_tmp=(year==cps_sy+`k')
	gegen post`k'_enrolled=max(post`k'_enrolled_tmp), by(vicid inv_caseid)
	drop post`k'_enrolled_tmp
	replace post`k'_enrolled=. if cps_sy+`k'>2017 | (post`k'_enrolled==0 & hasric==1 & cps_grade+`k'>12 & inrange(cps_grade,0,12))
	label variable post`k'_enrolled "Enrolled in a Michigan Public School `k' SY After Investigation"
}

*****FILL IN THE NUMBER OF PRIOR INVESTIGATIONS FOR STUDENTS WITHOUT A RIC
**Children without a ric take the count from n_prior_inv; a missing count becomes 0
replace pre_n_prior_inv=n_prior_inv if hasric==0
replace pre_n_prior_inv=0 if pre_n_prior_inv==.
***note: trim outliers by capping the count at its 95th percentile
summarize pre_n_prior_inv, detail
replace pre_n_prior_inv=min(pre_n_prior_inv,r(p95))

**Set aside the panel built so far
tempfile postvars
save `postvars'

*****FUTURE MALTREATMENT OUTCOMES
***(a) Annual
**Take the overall and year-by-year (1-5) investigation / substantiated-investigation
**outcomes from the future maltreatment file and attach them to every panel row of
**the same child/investigation. Records found only in the maltreatment file are
**not kept.
use "$cleandata/future_maltreatment.dta", clear
local annual_vars
forvalues k=1/5 {
	local annual_vars `annual_vars' post`k'_inv post`k'_sub_inv
}
keep vicid inv_caseid post_inv post_sub_inv post_n_inv post_n_sub_inv `annual_vars'
merge 1:m vicid inv_caseid using `postvars', keep(match using) nogenerate

**Year-K outcomes are undefined when cps_sy+K is beyond 2017 or when age_inv+K
**exceeds 17
forvalues k=1/5 {
	foreach x in inv sub_inv {
		replace post`k'_`x'=. if cps_sy+`k'>2017 | age_inv+`k'>17
	}
}

***(b) Monthly
**Build a lookup with one row per DHHS record holding the child id, the complaint
**date and whether the complaint was substantiated (preponderance), for use when
**searching for later complaints about the same child.
**preponderance is written out in full everywhere: the abbreviated name only
**resolves when variable abbreviation is on, and the rename below needs the full
**name in any case.
preserve
use "${cleandata}master_dhhs.dta", clear
**Foster care cases count as substantiated; missing is treated as not substantiated
replace preponderance=1 if fc==1
replace preponderance=0 if preponderance==.
gen parent=(mom==1 | dad==1 | parent_unkn==1)
keep vicid complaint_date preponderance parent
**Suffix the names so they do not collide with the panel's own variables after joinby
rename complaint_date complaint_date_dhhs
rename preponderance preponderance_dhhs
tempfile dhhs
save `dhhs'
restore

**Monthly future-maltreatment indicators (postm1_* ... postm108_*), built on a
**child/investigation-level copy of the data and merged back onto the panel.
preserve
keep vicid inv_caseid fc complaint_date removal_date cps_grade age_inv
**One row per child/investigation before pairing with the child's DHHS complaints
gduplicates drop vicid inv_caseid, force
**Pair each child/investigation with every DHHS complaint for the same child;
**children with no rows in the lookup are kept (unmatched(master))
joinby vicid using `dhhs', unmatched(master)
replace complaint_date_dhhs=. if complaint_date_dhhs==complaint_date	//do NOT count the initial complaint as future maltreatment//
drop _merge
**Month i is the window from 30*(i-1) to 30*i days after the reference date:
**complaint_date when fc==0, removal_date when fc==1. inrange() includes both
**endpoints, so a complaint exactly on a boundary falls in two adjacent months.
forv i=1/108 {
	gen postm`i'_inv_tmp=0
	gen postm`i'_inv_sub_tmp=0
	
	local i1=`i'-1
	**Any complaint in the window
	replace postm`i'_inv_tmp=1 if fc==0 & inrange(complaint_date_dhhs,complaint_date+30*`i1',complaint_date+30*`i')
	replace postm`i'_inv_tmp=1 if fc==1 & inrange(complaint_date_dhhs,removal_date+30*`i1',removal_date+30*`i')
	
	**Complaint in the window with preponderance_dhhs==1 (substantiated)
	replace postm`i'_inv_sub_tmp=1 if fc==0 & preponderance_dhhs==1 & inrange(complaint_date_dhhs,complaint_date+30*`i1',complaint_date+30*`i')
	replace postm`i'_inv_sub_tmp=1 if fc==1 & preponderance_dhhs==1 & inrange(complaint_date_dhhs,removal_date+30*`i1',removal_date+30*`i')
	
	
	**Set to missing if post month is beyond the sample period
	foreach x in inv inv_sub {
		replace postm`i'_`x'_tmp=. if fc==0 & complaint_date+30*`i'>date("6/1/2017","MDY")	
		replace postm`i'_`x'_tmp=. if fc==1 & removal_date+30*`i'>date("6/1/2017","MDY")
		**Also missing (not 0) when nothing is flagged and age_inv plus i/12 years exceeds 17
		replace postm`i'_`x'_tmp=. if postm`i'_`x'_tmp==0 & age_inv+(`i'/12)>17	//this is a really rough workaround//
		**Collapse across the paired complaints to the child/investigation level
		gegen postm`i'_`x'=max(postm`i'_`x'_tmp), by(vicid inv_caseid)
		drop postm`i'_`x'_tmp
	}
	
	la var postm`i'_inv "Had an investigation `i' months after original one"
	la var postm`i'_inv_sub "Had a sub investigation `i' months after original one"
}
**Back to one row per child/investigation, then attach to every panel row
keep vicid inv_caseid postm*
gduplicates drop vicid inv_caseid, force
tempfile postm_inv
save `postm_inv'
restore
merge m:1 vicid inv_caseid using `postm_inv'
drop _merge

****Cumulative monthly measure of future maltreatment
**postmM_<x>_cumu is the running sum of the monthly indicators postm1_<x> through
**postmM_<x>; once a monthly indicator is missing, the running sum is missing from
**that month onward.
foreach x in inv inv_sub {
	local what=cond("`x'"=="inv","Investigations","Sub Investigations")
	forvalues m=1/108 {
		if `m'==1 {
			gen postm1_`x'_cumu=postm1_`x'
		}
		else {
			local prev=`m'-1
			gen postm`m'_`x'_cumu=postm`prev'_`x'_cumu+postm`m'_`x'
		}
		label variable postm`m'_`x'_cumu "# `what' `m' months after original one"
	}
}

**Save Master Dataset As a Child*Inv*SY panel
**Identifiers and key variables go first, followed by every variable whose name
**starts with pre and then every variable whose name starts with post
order vicid inv_caseid year grade_fnl bcode_weight cps_year fc pre* post* 
sort vicid inv_caseid year 
**Shrink storage types where possible before writing the file
compress
save "$cleandata/master_child_panel_withpost.dta", replace









